/*
    Copyright (c) 2006 Novell and contributors:
        robert@ocallahan.org
    
    Permission is hereby granted, free of charge, to any person
    obtaining a copy of this software and associated documentation
    files (the "Software"), to deal in the Software without
    restriction, including without limitation the rights to use,
    copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the
    Software is furnished to do so, subject to the following
    conditions:
    
    The above copyright notice and this permission notice shall be
    included in all copies or substantial portions of the Software.
    
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef DATABASE_H__
#define DATABASE_H__

/* 
 * This file defines the structure of a Chronicle log database. Databases are
 * generated by the indexer and read by the query engine (or potentially other
 * tools).
 * 
 * The database file has a header at the start of the file. The header
 * points to a directory, usually located at the end of the file. The directory
 * contains a list of named entries, each one pointing to some chunk
 * of the file. Typically each named entry corresponds to some "stream"
 * within the file. Many of the streams contain their own pointers to
 * other areas within the file, so not all of the file is allocated to
 * directory entries.
 * 
 * Everything is arranged so that the indexer only needs to append to the file,
 * to maximise apparent disk bandwidth. The only exception is the final update
 * of the header record.
 * 
 * The database file format is architecture dependent; the sizes of some
 * fields, and the struct layout rules, and endianness, depend on the
 * architecture. It's also architecture independent in the sense that you could
 * read the architecture field from the header record and use that as a guide to
 * interpreting the rest of the file, allowing any database to be read or
 * written on any architecture. However, the definitions in this C header file
 * are only useful for reading and writing databases for the architecture we are
 * building on.
 * 
 * Currently, whenever compressed data is indicated, the method used
 * is zlib deflate. The size of the compressed data is always obtainable
 * from outside the compressed data, usually stored alongside the file offset
 * of the compressed data.
 */

#include "arch.h"
#include "effects.h"

/**
 * Bump this 16-bit version identifier every time there is an incompatible
 * format change.
 */
#define CH_DB_VERSION 0x0001

/**
 * Version number indicating an incomplete database.
 */
#define CH_DB_VERSION_INCOMPLETE 0x0000

/**
 * Timestamps are currently a count of the number of instructions retired.
 */
typedef uint64_t CH_TStamp;

#define CH_ARCH_X86   0
#define CH_ARCH_AMD64 1

/** The header at the start of a database file. */
typedef struct {
  /** "ChronicleDB" (null padded) */
  char      magic[12];
  /** some CH_ARCH_ constant */
  uint8_t   architecture;
  uint8_t   is_little_endian;
  /** CH_DB_VERSION for new databases, 0 for a database that's still being written */
  uint16_t  version;
  /**
   * If nonzero, then the register reconstruction log includes the
   * dynamic offsets for bunched effects that need them.
   */
  uint8_t   dynamic_offsets_in_reg_log;
  uint8_t   have_reg_reads;
  uint8_t   have_mem_reads;
  /** log2 of the size of a page in the memory effect maps */
  uint8_t   effect_map_page_size_bits;
  /** reserved padding, must be zero */
  uint8_t   reserved_zero[4];

  /** The file offset of the directory area. */
  uint64_t  directory_offset;
  /** The file offset of the directory names area. */
  uint64_t  name_offset;
  /** Number of directory entries. */
  uint32_t  directory_count;
  /** Size of the directory names area in bytes. */
  uint32_t  name_size;

  /** Timestamp of the end of execution of the traced process. */ 
  CH_TStamp end_tstamp;
} CH_DBHeader;

/** A directory entry. */
typedef struct {
  /** The offset of the data from the start of the file. */
  uint64_t offset;
  uint64_t length;
  /** The offset of the entry name within the directory names area. */
  uint32_t name_offset;
} CH_DBDirEntry;

/**
 * The EFFECT_SET area contains an array of pointers to "effect sets".
 * Each effect set defines the memory and/or register effects for a collection
 * of instrumented blocks. See CH_SECTION_CODE_INFO.
 */
#define CH_SECTION_EFFECT_SET "EFFECT_SET"
typedef struct {
  uint64_t fileloc;
  uint32_t compressed_size;
} CH_DBEffectSetEntry;

/**
 * The CODE_INFO area contains the static information about each instrumented
 * code block. It is an array of CH_DBCodeInfoEntry records, one per
 * instrumented code block, indexed by code block ID. Each
 * CH_DBCodeInfoEntry has an indirect pointer to the effect data for the
 * code block, and the number of bunched effects and register effects for this
 * code block.
 * offset_in_effect_set is an offset into the decompressed effect set data.
 * At that offset is an array of num_bunched_effects CH_BunchedEffect structs,
 * followed by an array of num_reg_effects CH_RegEffect structs.
 */
#define CH_SECTION_CODE_INFO "CODE_INFO"
typedef struct {
  uint32_t effect_set;
  uint32_t offset_in_effect_set;
  uint16_t num_bunched_effects;
  uint16_t num_reg_effects;
} CH_DBCodeInfoEntry;

/**
 * The REG_LOG area is an array of entries each describing a chunk of the
 * register log. Each chunk describes one register epoch.
 */
#define CH_SECTION_REG_LOG "REG_LOG"
typedef struct {
  /** The timestamp of the start of the epoch for this chunk. */
  CH_TStamp first_tstamp;
  /** A bitmask where bit i is zero if register i was definitely not modified
   * during this epoch. */
  uint64_t  registers_maybe_modified;
  /** The file offset of the register log chunk. */
  uint64_t  reg_log_chunk_fileloc;
  /** The maximum value held by the stack pointer at any time during this chunk. */
  uint64_t  SP_max;
  /** The compressed size of the register log chunk. */
  uint32_t  reg_log_chunk_compressed_size;
  /** An ID for this thread (unique among all threads live during the epoch. */
  uint32_t  pthread_cookie;
} CH_DBRegLogEntry;

/*
 * A register log chunk consists of one of CH_DBRegLogChunkX86 or
 * CH_DBRegLogChunkAMD64 followed by
 * 
 *   uint8_t  instructions_retired[num_codes_executed];
 *   uint32_t code_index[num_codes_executed]; // uint32_t aligned
 *   struct {
 *     // only if dynamic_offsets_in_reg_log is set!
 *     uintptr_t bunched_effect_dynamic_offsets[]; // pointer aligned,
 *       // one per bunched effect that has has_dynamic_offset set
 *
 *     uint8_t reg_log_data[]; // pointer aligned,
 *       // as much as is indicated by the register effects
 *   } log_data[num_codes_executed];
 *
 * In the log_data, if the code_index was executed before in this chunk,
 * the values stored are the uintptr_t-wise differences between the words in
 * this log_data and the words in the previous log_data for this code_index.
 * This compresses better.
 *
 * The log data is packed to satisfy alignment constraints (up to pointer
 * alignment). The algorithm is in ch_main.c:trace_record_allocate. The
 * algorithm is:
 *   var freeblock[]; // initially zero, indexed by size
 *   var freespace;   // initially some nonzero pointer
 *   allocate_bytes(n) for n a power of 2 {
 *     if (n >= sizeof(pointer)) {
 *       // Just take the space from the master allocation block
 *       result = freespace;
 *       freespace += n;
 *     } else {
 *       if (freeblock[n]) {
 *         // Use the existing free block
 *         result = freeblock[n];
 *         freeblock[n] = 0;
 *       } else {
 *         // Allocate a new block of size n*2; use the first half for the
 *         // space we need now, save the second half for later
 *         result = allocate_bytes(n*2);
 *         freeblock[n] = result + n;
 *       }
 *     }
 *     return result;
 *   }
 */

typedef struct {
  CH_X86Context initial_context;
  uint32_t      num_codes_executed;
} CH_DBRegLogChunkX86;

typedef struct {
  CH_AMD64Context initial_context;
  uint32_t        num_codes_executed;
} CH_DBRegLogChunkAMD64;

#ifdef CH_X86
#if __WORDSIZE == 64
typedef CH_DBRegLogChunkAMD64 CH_DBRegLogChunk;
#else
typedef CH_DBRegLogChunkX86   CH_DBRegLogChunk;
#endif
#else
#error Unknown architecture
#endif

/**
 * The memory effect maps are stored in areas with directory entries named
 * according to the map type: currently MEM_READ, MEM_WRITE and ENTER_SP.
 * 
 * A memory effect map area starts with a CH_EffectDBMap describing the size of
 * the page hash table and the number of pages. The hash table itself
 * follows: each entry in the table is a uint32_t, either zero or the index
 * of a page entry. The page entries themselves follow the hash table. The hash
 * table allows for efficient lookup of per-page history without requiring
 * exorbitant storage for large sparse address spaces. Pages that do not
 * appear in the hash table have no associated effects.
 * 
 * The page_table_size must be a power of 2. The hash function mapping page
 * numbers to hash table entries is as follows:
 *
 * uint32_t hash_page_num(uintptr_t page_num, uint32_t size) {
 *   uint64_t v1 = (uint32_t)(page_num >> 32)*31901901;
 *   uint64_t v2 = ((uint32_t)page_num)*39019017;
 *   uint64_t m1 = v1 ^ (v1 >> 8) ^ (v1 >> 16) ^ (v1 >> 24) ^ (v1 >> 32)
 *  ^ (v1 >> 40);
 *   uint64_t m2 = v2 ^ (v2 >> 8) ^ (v2 >> 16) ^ (v2 >> 24) ^ (v2 >> 32)
 * ^ (v2 >> 40);
 *   uint32_t mix = (uint32_t)m1 ^ (uint32_t)m2;
 *   return mix & (size - 1);
 * }
 */
typedef struct {
  uint32_t page_entry_count;
  uint32_t page_table_size;
} CH_EffectDBMap;

/**
 * Each page entry has pointers to an array of bitmap entries and an array
 * of history entries for this page. The arrays are in temporal order.
 */
typedef struct {
  uint64_t page_num;
  uint64_t history_entry_fileloc;
  uint64_t bitmap_entry_fileloc;
  uint32_t history_entry_count;
  uint32_t bitmap_entry_count;
} CH_EffectPageEntry;

/**
 * Each bitmap entry is just a pointer to the compressed bitmap data. The
 * bitmap data is run-length-encoded and then gzipped. The run-length-encoding
 * is just a sequence of 16-bit unsigned values interpreted as:
 * <number of 0 bits> <number of 1 bits> <number of 0 bits> ... etc
 */
typedef struct {
  uint64_t bitmap_fileloc;
  uint32_t bitmap_compressed_size;
} CH_EffectBitmapEntry;

/**
 * Each history entry describes one memory effect epoch.
 */
typedef struct {
  /** The timestamp of the first effect in this epoch. */
  CH_TStamp first_tstamp;
  /** The timestamp of the last effect in this epoch. */
  CH_TStamp last_tstamp;
  /** Pointer to the access list for this epoch. */
  uint64_t  access_fileloc;
  /** Number of accesses in the list. */
  uint32_t  access_list_count;
  /** Total amount of data associated with the accesses. */
  uint32_t  access_data_count;
  /** Compressed size of the access list. */
  uint32_t  access_compressed_size;
  /**
   * Index of the access bitmap associated with this epoch; used to look up
   * the bitmap list for this page.
   */
  uint32_t  access_bitmap_index;
  /**
   * The true value of tstamp_offset for the last access item. Storing this
   * here is redundant but allows efficient reverse traversal of the
   * access list.
   */
  uint32_t  final_tstamp_offset;
  /**
   * The true value of 'offset' for the last access item. Storing this here
   * is redundant but allows efficient reverse traversal of the access list.
   */
  uint16_t  final_offset;
} CH_EffectHistoryEntry;

/**
 * The access list is an array of CH_EffectEffectItems. If there is data
 * associated with the effects then the data follows the CH_EffectEffectItems array.
 */
typedef struct {
  /**
   * For better compression, this is stored as the delta between the value for
   * the last item and this one ... for the first item, we store the true value
   * (which is always zero in fact!).
   * This is the offset of this effect's timestamp from the first_tstamp
   * for this epoch.
   */
  uint32_t tstamp_offset;
  /**
   * For better compression, this is stored as the delta between the value for
   * the last item and this one ... for the first item, we store the true value.
   * This is the offset of this effect's memory address from the start of
   * the page.
   */
  uint16_t offset;
  /** 
   * The length of the memory range affected by this effect. A value of zero
   * means 2^16.
   */
  uint16_t length;

  /** The bunched effect details. */
  CH_BunchedEffectAtoms atoms;
} CH_EffectItem;

/**
 * The ADDRESS_MAP_EVENTS area contains an array of CH_DBAddrMapEntry records
 * describing all address map changes, in temporal order. The sequence starts
 * with a list of records with timestamp 1, describing the initial
 * address space map. When memory is mapped, the event describes how to
 * obtain the new contents of the memory via one of a few options:
 * -- contents may be written to the log database as a sequence of memory writes
 * -- contents may be zero
 * -- contents may be found in a file that the tracer deemed "immutable"
 * (e.g., an executable file not writeable by the user)
 */
#define CH_SECTION_ADDR_MAP "ADDRESS_MAP_EVENTS"
typedef struct {
  /**
   * The timestamp of the address map change. Address map changes are
   * considered to happen before any instruction execution with the same
   * timestamp.
   */
  CH_TStamp tstamp;
  /** Virtual address of the memory region affected. */
  uint64_t  address;
  /** Virtual length of the memory region affected. */
  uint64_t  length;
  /** 1 if the region will be mapped, 0 if it will be unmapped. */
  uint8_t   is_mapped;
  /** 1 if the region will be readable. */
  uint8_t   is_read;
  /** 1 if the region will be writeable. */
  uint8_t   is_write;
  /** 1 if the region will be executable. */
  uint8_t   is_execute;
  /** 1 if the region is backed by a file. */
  uint8_t   is_file;
  /**
   * If set, don't try to get debug info for this mapping, usually because
   * the mapping belongs to valgrind-specific memory whose symbols should not
   * be pulled in.
   */
  uint8_t   suppress_debug_info;
  /**
   * If set, the log database will contain memory write effects that initialize
   * the memory area with its contents.
   */
  uint8_t   contents_will_follow;
  /** If set, the memory area is being set to zero. */
  uint8_t   contents_set_zero;
  /** If set, the contents can be read from a file (details below). */
  uint8_t   contents_from_file;
  /**
   * If set, then this mapping operation is not changing the contents of
   * the memory (e.g. it might just be changing the permissions).
   */
  uint8_t   contents_unchanged;
  uint8_t   reserved_zero[3];

  /**
   * The following are only set for file mappings, otherwise they
   * will be zero. They point to a filename in the log database, in UTF8.
   * For file mappings these may still be zero if the filename is not
   * available. The device and inode numbers of a file can be checked against
   * those recorded to ensure this is the same file.
   */
  uint32_t  filename_len;
  uint64_t  filename_fileloc;
  
  /** The device number for the mapping. */
  uint64_t  device;
  /** The inode number for the mapping. */
  uint64_t  inode;
  /** The offset in the file to the start of the mapped area. */
  uint64_t  offset;
} CH_DBAddrMapEntry;

#endif
